Helper Functions

This section details the helper functions that were used to reduce some of the repetition during exploratory data analysis.

# Create a histogram of the variable with annotations for the mean and outliers
#
# Args:
#   df:    data frame handed to ggplot() as the plot data.
#   col:   numeric vector to plot (e.g. df$LB); it drives the bars and the
#          positions of the reference lines.
#   title: plot title.
#   x:     x-axis label.
#
# Returns the ggplot object: a histogram with a red vertical line at the mean
# of `col` and blue vertical lines at the mean -/+ 3 standard deviations.
generate_histogram <- function(df, col, title, x) {
  # Work out the reference positions once, as plain numbers. Supplying them
  # as constant `xintercept` values draws each line a single time; mapping a
  # constant through aes() repeats the identical line for every row of `df`.
  center <- mean(col)
  spread <- 3 * sd(col)

  ggplot(df, aes(x = col)) +
    geom_histogram() +
    geom_vline(xintercept = center, color = "red") +
    geom_vline(xintercept = center - spread, color = "blue") +
    geom_vline(xintercept = center + spread, color = "blue") +
    labs(title = title, x = x, y = "Occurrences")
}

# Create a single boxplot of the variable.
#
# Args:
#   df:    data frame handed to ggplot() as the plot data.
#   col:   numeric vector to plot (e.g. df$LB), mapped to the x axis.
#   title: plot title.
#   x:     x-axis label.
#
# Returns the ggplot object.
generate_boxplot <- function(df, col, title, x) {
  # Only x is mapped, so the other axis carries no counts; leave it
  # unlabelled (as the faceted boxplot helpers do) instead of "Occurrences".
  ggplot(df, aes(x = col)) +
    geom_boxplot() +
    labs(title = title, x = x, y = NULL)
}

# Histogram of the variable with one panel per CLASS level.
#
# Args:
#   df:    data frame handed to ggplot(); CLASS is referenced for the fill
#          colour and the facets, so `df` is expected to provide it.
#   col:   numeric vector to plot (e.g. df$LB).
#   title: plot title.
#   x:     label for the variable's axis.
#
# Returns the ggplot object.
generate_class_histogram_facet <- function(df, col, title, x) {
  p <- ggplot(data = df, mapping = aes(x = col, fill = CLASS))
  p <- p + geom_histogram()
  # Flip the coordinates so the variable runs along the vertical axis.
  p <- p + coord_flip()
  p <- p + facet_grid(~ CLASS)
  p + labs(title = title, x = x, y = "Occurrences")
}


# Boxplot of the variable with one panel per CLASS level.
#
# Args:
#   df:    data frame handed to ggplot(); CLASS is referenced for the fill
#          colour and the facets, so `df` is expected to provide it.
#   col:   numeric vector to plot (e.g. df$LB).
#   title: plot title.
#   x:     label for the variable's axis.
#
# Returns the ggplot object.
generate_class_boxplot_facet <- function(df, col, title, x) {
  class_mapping <- aes(x = col, fill = CLASS)
  plot_labels <- labs(title = title, x = x)

  # coord_flip() turns the boxes so the variable runs along the vertical axis.
  ggplot(df, class_mapping) +
    geom_boxplot() +
    coord_flip() +
    facet_grid(~ CLASS) +
    plot_labels
}

# Histogram of the variable with one panel per NSP level.
#
# Args:
#   df:    data frame handed to ggplot(); NSP is referenced for the fill
#          colour and the facets, so `df` is expected to provide it.
#   col:   numeric vector to plot (e.g. df$LB).
#   title: plot title.
#   x:     label for the variable's axis.
#
# Returns the ggplot object.
generate_nsp_histogram_facet <- function(df, col, title, x) {
  p <- ggplot(data = df, mapping = aes(x = col, fill = NSP))
  p <- p + geom_histogram()
  # Flip the coordinates so the variable runs along the vertical axis.
  p <- p + coord_flip()
  p <- p + facet_grid(~ NSP)
  p + labs(title = title, x = x, y = "Occurrences")
}


# Boxplot of the variable with one panel per NSP level.
#
# Args:
#   df:    data frame handed to ggplot(); NSP is referenced for the fill
#          colour and the facets, so `df` is expected to provide it.
#   col:   numeric vector to plot (e.g. df$LB).
#   title: plot title.
#   x:     label for the variable's axis.
#
# Returns the ggplot object.
generate_nsp_boxplot_facet <- function(df, col, title, x) {
  nsp_mapping <- aes(x = col, fill = NSP)
  plot_labels <- labs(title = title, x = x)

  # coord_flip() turns the boxes so the variable runs along the vertical axis.
  ggplot(df, nsp_mapping) +
    geom_boxplot() +
    coord_flip() +
    facet_grid(~ NSP) +
    plot_labels
}



# Return the values lying more than three standard deviations from the mean.
#
# Args:
#   data: numeric vector.
#
# Returns the elements of `data` that are above mean + 3 * sd or below
# mean - 3 * sd, in their original order, or a zero-length vector when there
# are none. Missing values are ignored when the limits are computed and are
# never reported as outliers.
return_outliers <- function(data) {
  center <- mean(data, na.rm = TRUE)
  spread <- 3 * sd(data, na.rm = TRUE)

  mask <- (data > center + spread) | (data < center - spread)

  # which() keeps only the TRUE positions, so NA entries of the mask (missing
  # inputs, or an undefined sd for a single value) select nothing.
  data[which(mask)]
}

Data Exploration

# Read the CTG measurements from the working directory.
df <- read.csv("CTG.csv")
# CLASS and NSP hold category codes; store both columns as factors.
df[c("CLASS", "NSP")] <- lapply(df[c("CLASS", "NSP")], as.factor)

The data consists of 2126 rows that detail the results of Cardiotocograms (CTGs) and a classification by expert obstetricians (Marques de Sai et al, 2010). For each row, the data contains 23 variables, with the final two being the classifications by experts (Marques de Sai et al, 2010).

# Print each column's type and leading values to check the import.
str(df)
## 'data.frame':    2126 obs. of  23 variables:
##  $ LB      : int  120 132 133 134 132 134 134 122 122 122 ...
##  $ AC      : num  0 0.006 0.003 0.003 0.007 0.001 0.001 0 0 0 ...
##  $ FM      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ UC      : num  0 0.006 0.008 0.008 0.008 0.01 0.013 0 0.002 0.003 ...
##  $ DL      : num  0 0.003 0.003 0.003 0 0.009 0.008 0 0 0 ...
##  $ DS      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DP      : num  0 0 0 0 0 0.002 0.003 0 0 0 ...
##  $ ASTV    : int  73 17 16 16 16 26 29 83 84 86 ...
##  $ MSTV    : num  0.5 2.1 2.1 2.4 2.4 5.9 6.3 0.5 0.5 0.3 ...
##  $ ALTV    : int  43 0 0 0 0 0 0 6 5 6 ...
##  $ MLTV    : num  2.4 10.4 13.4 23 19.9 0 0 15.6 13.6 10.6 ...
##  $ Width   : int  64 130 130 117 117 150 150 68 68 68 ...
##  $ Min     : int  62 68 68 53 53 50 50 62 62 62 ...
##  $ Max     : int  126 198 198 170 170 200 200 130 130 130 ...
##  $ Nmax    : int  2 6 5 11 9 5 6 0 0 1 ...
##  $ Nzeros  : int  0 1 1 0 0 3 3 0 0 0 ...
##  $ Mode    : int  120 141 141 137 137 76 71 122 122 122 ...
##  $ Mean    : int  137 136 135 134 136 107 107 122 122 122 ...
##  $ Median  : int  121 140 138 137 138 107 106 123 123 123 ...
##  $ Variance: int  73 12 13 13 11 170 215 3 3 1 ...
##  $ Tendency: int  1 0 0 1 1 0 0 1 1 1 ...
##  $ CLASS   : Factor w/ 10 levels "1","2","3","4",..: 9 6 6 6 2 8 8 9 9 9 ...
##  $ NSP     : Factor w/ 3 levels "1","2","3": 2 1 1 1 1 3 3 3 3 3 ...
# Display the whole data frame as a table.
# NOTE(review): datatable() is presumably DT::datatable - confirm against the setup chunk.
datatable(df)

LB - FHR baseline (beats per minute)

The LB variable details the fetal heart rate baseline in terms of beats per minute (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$LB
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   106.0   126.0   133.0   133.3   140.0   160.0
generate_histogram(df, col_of_interest,  "Histogram of Fetal Heartbeat Baseline", "Heartbeat Baseline") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Fetal Heartbeat Baseline", "Heartbeat Baseline") 

There were 0 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of LB by Morphologic Pattern (CLASS)", "Heartbeat Baseline")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of LB by Morphologic Pattern (CLASS)", "Heartbeat Baseline")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of LB by Fetal State Class", "Heartbeat Baseline")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of LB by Fetal State Class", "Heartbeat Baseline")

AC - Accelerations per second

The AC variable details the accelerations per second as recorded during the CTG (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$AC
summary(col_of_interest)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.002000 0.003178 0.006000 0.019000
generate_histogram(df, col_of_interest,  "Histogram of Fetal Accelerations per Second ", "Accelerations per Second ") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Fetal Accelerations per Second", "Accelerations per Second") 

There were 23 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of AC by Morphologic Pattern (CLASS)", "Accelerations per Second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of AC by Morphologic Pattern (CLASS)", "Accelerations per Second")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of AC by Fetal State Class", "Accelerations per Second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of AC by Fetal State Class", "Accelerations per Second")

FM - Fetal movements per second

The FM variable details the fetal movements per second (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$FM
summary(col_of_interest)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.009481 0.003000 0.481000
generate_histogram(df, col_of_interest,  "Histogram of Fetal Movements per Second", "Fetal Movements per Second") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Fetal Movements per Second", "Fetal Movements per Second") 

There were 31 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of FM by Morphologic Pattern (CLASS)", "Fetal Movements per Second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of FM by Morphologic Pattern (CLASS)", "Fetal Movements per Second")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of FM by Fetal State Class", "Fetal Movements per Second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of FM by Fetal State Class", "Fetal Movements per Second")

UC - Uterine contractions per second

The UC variable details the uterine contractions per second (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$UC
summary(col_of_interest)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.002000 0.004000 0.004366 0.007000 0.015000
generate_histogram(df, col_of_interest,  "Histogram of Uterine Contractions Per Second", "Uterine Contractions Per Second") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Uterine Contractions Per Second", "Uterine Contractions Per Second") 

There were 3 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of UC by Morphologic Pattern (CLASS)", "Uterine Contractions Per Second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of UC by Morphologic Pattern (CLASS)", "Uterine Contractions Per Second")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of UC by Fetal State Class", "Uterine Contractions Per Second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of UC by Fetal State Class", "Uterine Contractions Per Second")

DL - Light decelerations per second

The DL variable details light deceleration per second as recorded during the CTG (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$DL
summary(col_of_interest)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.001889 0.003000 0.015000
generate_histogram(df, col_of_interest,  "Histogram of Decelerations per Second", "Decelerations per Second") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Decelerations per Second", "Decelerations per Second") 

There were 43 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of DL by Morphologic Pattern (CLASS)", "Decelerations per Second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of DL by Morphologic Pattern (CLASS)", "Decelerations per Second")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of DL by Fetal State Class", "Decelerations per Second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of DL by Fetal State Class", "Decelerations per Second")

DS - Severe decelerations per second

The DS variable details the severe decelerations per second.

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$DS
summary(col_of_interest)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 3.293e-06 0.000e+00 1.000e-03
generate_histogram(df, col_of_interest,  "Histogram of Severe decelerations per second", "Severe decelerations per second") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Severe decelerations per second", "Severe decelerations per second") 

There were 7 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of DS by Morphologic Pattern (CLASS)", "Severe decelerations per second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of DS by Morphologic Pattern (CLASS)", "Severe decelerations per second")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of DS by Fetal State Class", "Severe decelerations per second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of DS by Fetal State Class", "Severe decelerations per second")

DP - Prolonged Decelerations per Second

The DP variable details the prolonged decelerations per second, as recorded during the CTG (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$DP
summary(col_of_interest)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0000000 0.0000000 0.0001585 0.0000000 0.0050000
generate_histogram(df, col_of_interest,  "Histogram of Prolonged Decelerations per Second", "Prolonged Decelerations per Second") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Prolonged Decelerations per Second", "Prolonged Decelerations per Second") 

There were 108 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of DP by Morphologic Pattern (CLASS)", "Prolonged Decelerations per Second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of DP by Morphologic Pattern (CLASS)", "Prolonged Decelerations per Second")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of DP by Fetal State Class", "Prolonged Decelerations per Second")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of DP by Fetal State Class", "Prolonged Decelerations per Second")

ASTV - Percentage of Time with Abnormal Short Term Variability

The ASTV variable details the percentage of time with abnormal short term variability (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$ASTV
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.00   32.00   49.00   46.99   61.00   87.00
generate_histogram(df, col_of_interest,  "Histogram of Percentage of Time with Abnormal Short Term Variability", "Percentage of Time with Abnormal Short Term Variability") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Percentage of Time with Abnormal Short Term Variability", "Percentage of Time with Abnormal Short Term Variability") 

There were 0 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of ASTV by Morphologic Pattern (CLASS)", "Percentage of Time with Abnormal Short Term Variability")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of ASTV by Morphologic Pattern (CLASS)", "Percentage of Time with Abnormal Short Term Variability")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of ASTV by Fetal State Class", "Percentage of Time with Abnormal Short Term Variability")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of ASTV by Fetal State Class", "Percentage of Time with Abnormal Short Term Variability")

MSTV - mean value of short term variability

The MSTV variable details the mean value of short term variability (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$MSTV
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.200   0.700   1.200   1.333   1.700   7.000
generate_histogram(df, col_of_interest,  "Histogram of Mean Short Term Variability", "Mean Short Term Variability") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Mean Short Term Variability", "Mean Short Term Variability") 

There were 33 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of MSTV by Morphologic Pattern (CLASS)", "Mean Short Term Variability")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of MSTV by Morphologic Pattern (CLASS)", "Mean Short Term Variability")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of MSTV by Fetal State Class", "Mean Short Term Variability")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of MSTV by Fetal State Class", "Mean Short Term Variability")

ALTV - percentage of time with abnormal long term variability

The ALTV variable details the percentage of time with abnormal long term variability (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$ALTV
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   9.847  11.000  91.000
generate_histogram(df, col_of_interest,  "Histogram of Percentage of Time with Abnormal Long Term Variability", "Percentage of Time with Abnormal Long Term Variability") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Percentage of Time with Abnormal Long Term Variability", "Percentage of Time with Abnormal Long Term Variability") 

There were 59 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of ALTV by Morphologic Pattern (CLASS)", "Percentage of Time with Abnormal Long Term Variability")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of ALTV by Morphologic Pattern (CLASS)", "Percentage of Time with Abnormal Long Term Variability")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of ALTV by Fetal State Class", "Percentage of Time with Abnormal Long Term Variability")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of ALTV by Fetal State Class", "Percentage of Time with Abnormal Long Term Variability")

MLTV - mean value of long term variability

The MLTV variable details the mean value of long term variability (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$MLTV
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   4.600   7.400   8.188  10.800  50.700
generate_histogram(df, col_of_interest,  "Histogram of Mean  Long Term Variability", "Mean  Long Term Variability") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Mean  Long Term Variability", "Mean  Long Term Variability") 

There were 33 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of MLTV by Morphologic Pattern (CLASS)", "Mean  Long Term Variability")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of MLTV by Morphologic Pattern (CLASS)", "Mean  Long Term Variability")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of MLTV by Fetal State Class", "Mean  Long Term Variability")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of MLTV by Fetal State Class", "Mean  Long Term Variability")

Width - width of FHR histogram

The Width variable details the width of the CTG heart rate histogram (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$Width
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   37.00   67.50   70.45  100.00  180.00
generate_histogram(df, col_of_interest,  "Histogram of Width of FHR Histogram", "Width of FHR Histogram") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Width of FHR Histogram", "Width of FHR Histogram") 

There were 0 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of Width by Morphologic Pattern (CLASS)", "Width of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of Width by Morphologic Pattern (CLASS)", "Width of FHR Histogram")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of Width by Fetal State Class", "Width of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of Width by Fetal State Class", "Width of FHR Histogram")

Min - minimum of FHR histogram

The Min variable details the minimum in the CTG heart rate histogram (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$Min
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50.00   67.00   93.00   93.58  120.00  159.00
generate_histogram(df, col_of_interest,  "Histogram of Min of FHR Histogram", "Min of FHR Histogram") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Min of FHR Histogram", "Min of FHR Histogram") 

There were 0 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of Min by Morphologic Pattern (CLASS)", "Min of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of Min by Morphologic Pattern (CLASS)", "Min of FHR Histogram")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of Min by Fetal State Class", "Min of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of Min by Fetal State Class", "Min of FHR Histogram")

Max - Maximum of FHR histogram

The Max variable details the maximum in the CTG heart rate histogram (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$Max
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     122     152     162     164     174     238
generate_histogram(df, col_of_interest,  "Histogram of Max of FHR Histogram", "Max of FHR Histogram") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Max of FHR Histogram", "Max of FHR Histogram") 

There were 14 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of Max by Morphologic Pattern (CLASS)", "Max of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of Max by Morphologic Pattern (CLASS)", "Max of FHR Histogram")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of Max by Fetal State Class", "Max of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of Max by Fetal State Class", "Max of FHR Histogram")

Nmax - # of histogram peaks

The Nmax variable details the number of peaks in the CTG heart rate histogram (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$Nmax
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   3.000   4.068   6.000  18.000
generate_histogram(df, col_of_interest,  "Histogram of Number of Histogram Peaks", "Number of Histogram Peaks") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Number of Histogram Peaks", "Number of Histogram Peaks") 

There were 19 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of Nmax by Morphologic Pattern (CLASS)", "Number of Histogram Peaks")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of Nmax by Morphologic Pattern (CLASS)", "Number of Histogram Peaks")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of Nmax by Fetal State Class", "Number of Histogram Peaks")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of Nmax by Fetal State Class", "Number of Histogram Peaks")

Nzeros - # of histogram zeros

The Nzeros variable details the number of zeros in the CTG heart rate histogram (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$Nzeros
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3236  0.0000 10.0000
generate_histogram(df, col_of_interest,  "Histogram of Number of Histogram Zeros", "Number of Histogram Zeros") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Number of Histogram Zeros", "Number of Histogram Zeros") 

There were 28 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of Nzeros by Morphologic Pattern (CLASS)", "Number of Histogram Zeros")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of Nzeros by Morphologic Pattern (CLASS)", "Number of Histogram Zeros")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of Nzeros by Fetal State Class", "Number of Histogram Zeros")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of Nzeros by Fetal State Class", "Number of Histogram Zeros")

Mode - histogram mode

The Mode variable details the mode of the CTG heart rate histogram (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$Mode 
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    60.0   129.0   139.0   137.5   148.0   187.0
generate_histogram(df, col_of_interest,  "Histogram of Mode of FHR Histogram", "Mode of FHR Histogram") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Mode of FHR Histogram", "Mode of FHR Histogram") 

There were 39 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of Mode  by Morphologic Pattern (CLASS)", "Mode of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of Mode  by Morphologic Pattern (CLASS)", "Mode of FHR Histogram")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of Mode  by Fetal State Class", "Mode of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of Mode  by Fetal State Class", "Mode of FHR Histogram")

Mean - histogram mean

The Mean variable details the mean of the CTG heart rate histogram (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$Mean 
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    73.0   125.0   136.0   134.6   145.0   182.0
generate_histogram(df, col_of_interest,  "Histogram of Mean of FHR Histogram", "Mean of FHR Histogram") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Mean of FHR Histogram", "Mean of FHR Histogram") 

There were 26 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of Mean  by Morphologic Pattern (CLASS)", "Mean of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of Mean  by Morphologic Pattern (CLASS)", "Mean of FHR Histogram")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of Mean  by Fetal State Class", "Mean of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of Mean  by Fetal State Class", "Mean of FHR Histogram")

Median - histogram median

The Median variable details the median of the CTG heart rate histogram (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$Median 
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    77.0   129.0   139.0   138.1   148.0   186.0
generate_histogram(df, col_of_interest,  "Histogram of Median of FHR Histogram", "Median of FHR Histogram") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Median of FHR Histogram", "Median of FHR Histogram") 

There were 16 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of Median  by Morphologic Pattern (CLASS)", "Median of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of Median  by Morphologic Pattern (CLASS)", "Median of FHR Histogram")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of Median  by Fetal State Class", "Median of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of Median  by Fetal State Class", "Median of FHR Histogram")

Variance - histogram variance

The Variance variable details the variance of the CTG heart rate histogram (Marques de Sai et al, 2010).

Overview

# Column under study; the summary and every plot in this section reuse it.
col_of_interest <- df$Variance 
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    2.00    7.00   18.81   24.00  269.00
generate_histogram(df, col_of_interest,  "Histogram of Variance of FHR Histogram", "Variance of FHR Histogram") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Boxplot of Variance of FHR Histogram", "Variance of FHR Histogram") 

There were 44 outliers for this column.

Facet By Morphologic Pattern (CLASS)

generate_class_histogram_facet(df, col_of_interest, "Histogram of Variance  by Morphologic Pattern (CLASS)", "Variance of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(df, col_of_interest, "Boxplot of Variance  by Morphologic Pattern (CLASS)", "Variance of FHR Histogram")

Facet By Fetal State Class (NSP)

generate_nsp_histogram_facet(df, col_of_interest, "Histogram of Variance  by Fetal State Class", "Variance of FHR Histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(df, col_of_interest, "Boxplot of Variance  by Fetal State Class", "Variance of FHR Histogram")

Tendency - histogram tendency

The Tendency variable details the tendency of the CTG heart rate histogram (Marques de Sai et al, 2010).

Overview

col_of_interest <- df$Tendency 
summary(col_of_interest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.0000  0.0000  0.0000  0.3203  1.0000  1.0000
generate_histogram(df, col_of_interest,  "Histogram of Tendency of FHR Histogram", "Tendency of FHR Histogram") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_boxplot(df, col_of_interest,  "Histogram of Tendency of FHR Histogram", "Tendency of FHR Histogram") 

There were 0 outliers for this column.

Facet By Morphologic Pattern (CLASS)

# Distribution of the variable within each morphologic pattern (CLASS).
# Title typo fixed ("Hisogram" -> "Histogram").
generate_class_histogram_facet(
  df, col_of_interest,
  "Histogram of Tendency  by Morphologic Pattern (CLASS)",
  "Tendency of FHR Histogram"
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_class_boxplot_facet(
  df, col_of_interest,
  "Boxplot of Tendency  by Morphologic Pattern (CLASS)",
  "Tendency of FHR Histogram"
)

Facet By Fetal State Class (NSP)

# Distribution of the variable within each fetal state class (NSP):
# faceted histogram first, then faceted boxplot.
generate_nsp_histogram_facet(
  df = df,
  col = col_of_interest,
  title = "Histogram of Tendency  by Fetal State Class",
  x = "Tendency of FHR Histogram"
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

generate_nsp_boxplot_facet(
  df = df,
  col = col_of_interest,
  title = "Boxplot of Tendency  by Fetal State Class",
  x = "Tendency of FHR Histogram"
)

CLASS - FHR pattern class code (1 to 10)

# Number of rows in each CLASS level.
table(df[["CLASS"]])
## 
##   1   2   3   4   5   6   7   8   9  10 
## 384 579  53  81  72 332 252 107  69 197

NSP - fetal state class code (N=normal; S=suspect; P=pathologic)

# Number of rows in each NSP level.
table(df[["NSP"]])
## 
##    1    2    3 
## 1655  295  176

Data Loading

Load the data for modeling. During exploratory analysis, it was discovered that the classifications for NSP are not balanced. The initial_split function from rsample in tidymodels is used to split the data into training and test sets, while preserving the distribution of the NSP variable (Silge).

# Re-read the CSV for modelling.
df <- read.csv("CTG.csv")
# Drop CLASS so it is not picked up as a predictor by `NSP ~ .`, and
# convert the target NSP to a factor for classification.
df$CLASS <- NULL
df$NSP <- as.factor(df$NSP)
# Fix the RNG seed so the random train/test split is reproducible.
set.seed(42)
# 75/25 split, stratified on NSP to preserve its class proportions.
data_partitioned <- initial_split(df, prop = 0.75, strata = NSP)
train <- training(data_partitioned)
test <- testing(data_partitioned)

Tree Creation

First Attempt

Model Definition

The first attempt involved using all of the available features and not specifying any limitations to the model. The tree is created using the parsnip package in tidymodels, with the rpart engine and set for classification (Kuhn). The tree was fit on the training data, using all of the available columns as predictors.

# rpart-backed classification tree with parsnip's default hyperparameters.
tree_template <- set_mode(
  set_engine(decision_tree(), "rpart"),
  "classification"
)
# Fit on the training partition, using every remaining column as a predictor.
tree_model <- fit(tree_template, formula = NSP ~ ., data = train)
# Plot the underlying rpart object stored in the parsnip fit.
fancyRpartPlot(tree_model$fit, caption = "First Decision Tree Attempt")

Model Results

To assess the performance of the tree, the accuracy, confusion matrix, ROC Curve, and AUC are all captured (Han et al, 2011, p. 49).

# Hard class predictions on the test set, paired with the true label;
# used for accuracy and the confusion matrix.
predictions <- predict(tree_model, test) %>%
  mutate(true = test$NSP)
# Per-class probabilities bound to the test rows; used for the ROC curve
# and AUC.
predictions_prob <- predict(tree_model, test, type = "prob") %>%
  bind_cols(test)
accuracy(data = predictions, estimate = .pred_class, truth = true)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.917
conf_mat(data = predictions, estimate = .pred_class, truth = true)
##           Truth
## Prediction   1   2   3
##          1 396  24   1
##          2  11  52   1
##          3   4   3  40
# yardstick's probability metrics take the class-probability columns through
# `...` (one per factor level, in level order), so they are passed unnamed
# rather than through an undocumented `estimate =` name.
autoplot(
  roc_curve(data = predictions_prob, truth = NSP, .pred_1, .pred_2, .pred_3)
) +
  ggtitle("Multi Class ROC Curves For Model 1")

print(roc_auc(data = predictions_prob, truth = NSP, .pred_1, .pred_2, .pred_3))
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc hand_till      0.926
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc hand_till      0.926

Note: The article written by Brendan Cullen (2021) helped with using the features available in the collection of packages that make up tidymodels.

Second Attempt

Optimization

The second attempt is to use a grid search to find the optimal combination of min_n, tree depth, and cost complexity for the model using tune_grid from the tidymodels set of packages (Kuhn).

# Tree specification with all three hyperparameters left open for tuning.
tune_specification <- decision_tree(
  tree_depth = tune(),
  min_n = tune(),
  cost_complexity = tune()
) %>%
  set_mode("classification") %>%
  set_engine("rpart")

# Regular grid with 10 levels per tunable parameter. Uses
# extract_parameter_set_dials() in place of the deprecated parameters().
grid_search <- grid_regular(
  hardhat::extract_parameter_set_dials(tune_specification),
  levels = 10
)
# Fix the RNG seed so the cross-validation folds are reproducible.
set.seed(42)
# 3-fold cross-validation on the training set, scored on accuracy only.
# `resamples` is spelled out in full instead of relying on partial matching.
tuned <- tune_grid(
  tune_specification,
  NSP ~ .,
  resamples = vfold_cv(train, v = 3),
  grid = grid_search,
  metrics = metric_set(accuracy)
)

autoplot(tuned)

Using Best Parameters

Following the grid search, the best-performing set of parameters was saved and used to create a second model.

# Pick the parameter combination with the best cross-validated accuracy.
# The metric is named explicitly rather than left to select_best()'s default.
optimal_parameters <- select_best(tuned, metric = "accuracy")
print(optimal_parameters)
## # A tibble: 1 × 4
##   cost_complexity tree_depth min_n .config                
##             <dbl>      <int> <int> <chr>                  
## 1    0.0000000001          5    14 Preprocessor1_Model0331
# Substitute the chosen values for the tune() placeholders.
optimal_tree_specification <- finalize_model(tune_specification, optimal_parameters)

# Refit the finalized specification on the full training partition.
optimal_model <- fit(optimal_tree_specification,
                   NSP ~ .,
                   train)

fancyRpartPlot(optimal_model$fit, caption = "Final Decision Tree Attempt")

Model Performance

To assess the performance of the tree, the accuracy, confusion matrix, ROC Curve, and AUC are all captured (Han et al, 2011, p. 49).

# Hard class predictions on the test set, paired with the true label;
# used for accuracy and the confusion matrix.
predictions <- predict(optimal_model, test) %>%
  mutate(true = test$NSP)
# Per-class probabilities bound to the test rows; used for the ROC curve
# and AUC.
predictions_prob <- predict(optimal_model, test, type = "prob") %>%
  bind_cols(test)
accuracy(data = predictions, estimate = .pred_class, truth = true)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.912
conf_mat(data = predictions, estimate = .pred_class, truth = true)
##           Truth
## Prediction   1   2   3
##          1 400  27   5
##          2   7  49   1
##          3   4   3  36
# yardstick's probability metrics take the class-probability columns through
# `...` (one per factor level, in level order), so they are passed unnamed
# rather than through an undocumented `estimate =` name.
autoplot(
  roc_curve(data = predictions_prob, truth = NSP, .pred_1, .pred_2, .pred_3)
) +
  ggtitle("Multi Class ROC Curves For Final Model")

print(roc_auc(data = predictions_prob, truth = NSP, .pred_1, .pred_2, .pred_3))
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc hand_till      0.932
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc hand_till      0.932

References

Han, Kamber, & Pei. (2011). Chapter 8. Classification: Basic Concepts. Elsevier Science.

Kuhn, M. (n.d.). Model Tuning Via Grid Search. Retrieved from: https://tune.tidymodels.org/reference/tune_grid.html

Kuhn, M. (n.d.). Decision Trees. Retrieved from: https://parsnip.tidymodels.org/reference/decision_tree.html

Marques de Sá, J., Bernardes, J., & Ayres de Campos, D. (2010). Cardiotocography Data Set [CSV]. Retrieved from: https://archive.ics.uci.edu/ml/datasets/Cardiotocography#

Silge, J. (n.d.). Simple Training/Test Set Splitting. Retrieved from: https://rsample.tidymodels.org/reference/initial_split.html